{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第十一章 特征工程之数据预处理（11.5）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 11.5 特征筛选：WOE值与IV值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "11.5.3 WOE值与IV值的代码实现"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1.数据分箱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄</th>\n",
       "      <th>是否违约</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>50</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>46</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   年龄  是否违约\n",
       "0  22     1\n",
       "1  25     1\n",
       "2  20     0\n",
       "3  35     0\n",
       "4  32     1\n",
       "5  38     0\n",
       "6  50     0\n",
       "7  46     1"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 首先通过如下代码构造数据：\n",
    "import pandas as pd\n",
    "data = pd.DataFrame([[22,1],[25,1],[20,0],[35,0],[32,1],[38,0],[50,0],[46,1]], columns=['年龄', '是否违约'])\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    (19.97, 30.0]\n",
       "1    (19.97, 30.0]\n",
       "2    (19.97, 30.0]\n",
       "3     (30.0, 40.0]\n",
       "4     (30.0, 40.0]\n",
       "5     (30.0, 40.0]\n",
       "6     (40.0, 50.0]\n",
       "7     (40.0, 50.0]\n",
       "Name: 年龄, dtype: category\n",
       "Categories (3, interval[float64]): [(19.97, 30.0] < (30.0, 40.0] < (40.0, 50.0]]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 有了数据之后，根据“年龄”这一特征变量进行数据分箱，代码如下：\n",
    "data_cut = pd.cut(data['年龄'], 3)\n",
    "data_cut"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2.统计各个分箱样本总数、坏样本数和好样本数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 统计总客户数\n",
    "cut_group_all = data['是否违约'].groupby(data_cut).count()\n",
    "# 统计违约客户\n",
    "cut_y = data['是否违约'].groupby(data_cut).sum()\n",
    "# 统计未违约客户\n",
    "cut_n = cut_group_all - cut_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "年龄\n",
       "(19.97, 30.0]    3\n",
       "(30.0, 40.0]     3\n",
       "(40.0, 50.0]     2\n",
       "Name: 是否违约, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这里展示下cut_group_all的结果，如下所示：\n",
    "cut_group_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>总数</th>\n",
       "      <th>坏样本</th>\n",
       "      <th>好样本</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>年龄</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(19.97, 30.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(30.0, 40.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(40.0, 50.0]</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               总数  坏样本  好样本\n",
       "年龄                         \n",
       "(19.97, 30.0]   3    2    1\n",
       "(30.0, 40.0]    3    1    2\n",
       "(40.0, 50.0]    2    1    1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过2.2.1节相关知识点将cut_group_all、cut_y、cut_n进行汇总，代码如下，这里我们将违约客户命名为“坏样本”，非违约客户命名为“好样本”。\n",
    "df = pd.DataFrame()  # 创建一个空DataFrame用来汇总数据\n",
    "df['总数'] = cut_group_all\n",
    "df['坏样本'] = cut_y\n",
    "df['好样本'] = cut_n\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3.统计各分箱中坏样本比率和好样本比率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>总数</th>\n",
       "      <th>坏样本</th>\n",
       "      <th>好样本</th>\n",
       "      <th>坏样本%</th>\n",
       "      <th>好样本%</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>年龄</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(19.97, 30.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(30.0, 40.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(40.0, 50.0]</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               总数  坏样本  好样本  坏样本%  好样本%\n",
       "年龄                                     \n",
       "(19.97, 30.0]   3    2    1  0.50  0.25\n",
       "(30.0, 40.0]    3    1    2  0.25  0.50\n",
       "(40.0, 50.0]    2    1    1  0.25  0.25"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 计算坏样本%和好样本%\n",
    "df['坏样本%'] = df['坏样本'] / df['坏样本'].sum()\n",
    "df['好样本%'] = df['好样本'] / df['好样本'].sum()\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4.计算WOE值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>总数</th>\n",
       "      <th>坏样本</th>\n",
       "      <th>好样本</th>\n",
       "      <th>坏样本%</th>\n",
       "      <th>好样本%</th>\n",
       "      <th>WOE</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>年龄</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(19.97, 30.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(30.0, 40.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.50</td>\n",
       "      <td>-0.693147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(40.0, 50.0]</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               总数  坏样本  好样本  坏样本%  好样本%       WOE\n",
       "年龄                                               \n",
       "(19.97, 30.0]   3    2    1  0.50  0.25  0.693147\n",
       "(30.0, 40.0]    3    1    2  0.25  0.50 -0.693147\n",
       "(40.0, 50.0]    2    1    1  0.25  0.25  0.000000"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "df['WOE'] = np.log(df['坏样本%'] / df['好样本%'])\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "此外，我们在11.5.1节第一部分也讲过，在实际应用中，我们不希望WOE值出现无穷大（这样会导致之后计算的IV值也变为无穷大，丧失了IV值的意义），但是有的时候可能由于数据特殊性及分箱的原因，它还是出现了WOE值为无穷大的情况（某个分箱中只含有一种类别的数据），此时解决办法是当WOE值为无穷大时，将它替换为0，代码如下："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.replace({'WOE': {np.inf: 0, -np.inf: 0}})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5.计算IV值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>总数</th>\n",
       "      <th>坏样本</th>\n",
       "      <th>好样本</th>\n",
       "      <th>坏样本%</th>\n",
       "      <th>好样本%</th>\n",
       "      <th>WOE</th>\n",
       "      <th>IV</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>年龄</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>(19.97, 30.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.693147</td>\n",
       "      <td>0.173287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(30.0, 40.0]</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.50</td>\n",
       "      <td>-0.693147</td>\n",
       "      <td>0.173287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>(40.0, 50.0]</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               总数  坏样本  好样本  坏样本%  好样本%       WOE        IV\n",
       "年龄                                                         \n",
       "(19.97, 30.0]   3    2    1  0.50  0.25  0.693147  0.173287\n",
       "(30.0, 40.0]    3    1    2  0.25  0.50 -0.693147  0.173287\n",
       "(40.0, 50.0]    2    1    1  0.25  0.25  0.000000  0.000000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['IV'] = df['WOE'] * (df['坏样本%'] - df['好样本%'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.34657359027997264\n"
     ]
    }
   ],
   "source": [
    "iv = df['IV'].sum()\n",
    "print(iv)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "整理上面计算WOE值和IV值的内容，完整代码如下所示："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.34657359027997264\n"
     ]
    }
   ],
   "source": [
    "# 1.构造数据\n",
    "import pandas as pd\n",
    "data = pd.DataFrame([[22,1],[25,1],[20,0],[35,0],[32,1],[38,0],[50,0],[46,1]], columns=['年龄', '是否违约'])\n",
    "\n",
    "# 2.数据分箱\n",
    "data_cut = pd.cut(data['年龄'], 3)\n",
    "\n",
    "# 3.统计各个分箱样本总数、坏样本数和好样本数并汇总数据\n",
    "# 统计总客户数\n",
    "cut_group_all = data['是否违约'].groupby(data_cut).count()\n",
    "# 统计违约客户\n",
    "cut_y = data['是否违约'].groupby(data_cut).sum()\n",
    "# 统计未违约客户\n",
    "cut_n = cut_group_all - cut_y\n",
    "# 汇总基础数据\n",
    "df = pd.DataFrame()  # 创建一个空DataFrame用来汇总数据\n",
    "df['总数'] = cut_group_all\n",
    "df['坏样本'] = cut_y\n",
    "df['好样本'] = cut_n\n",
    "\n",
    "# 4.统计坏样本%和好样本%\n",
    "df['坏样本%'] = df['坏样本'] / df['坏样本'].sum()\n",
    "df['好样本%'] = df['好样本'] / df['好样本'].sum()\n",
    "\n",
    "# 5.计算WOE值\n",
    "import numpy as np\n",
    "df['WOE'] = np.log(df['坏样本%'] / df['好样本%'])\n",
    "df = df.replace({'WOE': {np.inf: 0, -np.inf: 0}})  # 替换可能存在的无穷大\n",
    "\n",
    "# 6.计算各个分箱的IV值\n",
    "df['IV'] = df['WOE'] * (df['坏样本%'] - df['好样本%'])\n",
    "\n",
    "# 7.汇总各个分箱的IV值，获得特征变量的IV值\n",
    "iv = df['IV'].sum()\n",
    "print(iv)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "11.5.4 案例实战：客户流失预警模型的IV值计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将上面的内容首先定义为一个函数\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def cal_iv(data, cut_num, feature, target):\n",
    "    # 1.数据分箱\n",
    "    data_cut = pd.cut(data[feature], cut_num)\n",
    "\n",
    "    # 2.统计各个分箱样本总数、坏样本数和好样本数\n",
    "    cut_group_all = data[target].groupby(data_cut).count()  # 总客户数\n",
    "    cut_y = data[target].groupby(data_cut).sum()  # 坏样本数\n",
    "    cut_n = cut_group_all - cut_y  # 好样本数\n",
    "    # 汇总基础数据\n",
    "    df = pd.DataFrame()  # 创建一个空DataFrame用来汇总数据\n",
    "    df['总数'] = cut_group_all\n",
    "    df['坏样本'] = cut_y\n",
    "    df['好样本'] = cut_n\n",
    "\n",
    "    # 3.统计坏样本%和好样本%\n",
    "    df['坏样本%'] = df['坏样本'] / df['坏样本'].sum()\n",
    "    df['好样本%'] = df['好样本'] / df['好样本'].sum()\n",
    "\n",
    "    # 4.计算WOE值\n",
    "    df['WOE'] = np.log(df['坏样本%'] / df['好样本%'])\n",
    "    df = df.replace({'WOE': {np.inf: 0, -np.inf: 0}}) \n",
    "\n",
    "    # 5.计算各个分箱的IV值\n",
    "    df['IV'] = df['WOE'] * (df['坏样本%'] - df['好样本%'])\n",
    "\n",
    "    # 6.汇总各个分箱的IV值，获得特征变量的IV值\n",
    "    iv = df['IV'].sum()\n",
    "    \n",
    "    print(iv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>账户资金（元）</th>\n",
       "      <th>最后一次交易距今时间（天）</th>\n",
       "      <th>上月交易佣金（元）</th>\n",
       "      <th>本券商使用时长（年）</th>\n",
       "      <th>是否流失</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22686.5</td>\n",
       "      <td>297</td>\n",
       "      <td>149.25</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>190055.0</td>\n",
       "      <td>42</td>\n",
       "      <td>284.75</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>29733.5</td>\n",
       "      <td>233</td>\n",
       "      <td>269.25</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>185667.5</td>\n",
       "      <td>44</td>\n",
       "      <td>211.50</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33648.5</td>\n",
       "      <td>213</td>\n",
       "      <td>353.50</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    账户资金（元）  最后一次交易距今时间（天）  上月交易佣金（元）  本券商使用时长（年）  是否流失\n",
       "0   22686.5            297     149.25           0     0\n",
       "1  190055.0             42     284.75           2     0\n",
       "2   29733.5            233     269.25           0     1\n",
       "3  185667.5             44     211.50           3     0\n",
       "4   33648.5            213     353.50           0     1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 有了上面的自动计算IV值的函数后，通过如下代码来读取客户流失预警模型中的相关数据：\n",
    "data = pd.read_excel('股票客户流失.xlsx')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.15205722409339645\n"
     ]
    }
   ],
   "source": [
    "# 我们利用刚刚编好的函数进行第一个特征变量“账户资金（元）”的IV值计算，代码如下：\n",
    "cal_iv(data, 4, '账户资金（元）', '是否流失')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "账户资金（元）的IV值为：\n",
      "0.15205722409339645\n",
      "最后一次交易距今时间（天）的IV值为：\n",
      "0.2508468300174099\n",
      "上月交易佣金（元）的IV值为：\n",
      "0.30811632146662304\n",
      "本券商使用时长（年）的IV值为：\n",
      "0.6144219248359752\n"
     ]
    }
   ],
   "source": [
    "for i in data.columns[:-1]:\n",
    "    print(i + '的IV值为：')\n",
    "    cal_iv(data, 4, i, '是否流失')  # 调用函数"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
